library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(readr)
library(viridis)
## Loading required package: viridisLite
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
load("../data/25x/Coleps_viridis/Morph_mvt.RData")
dd25 <- morph_mvt
load("../data/25x/Coleps_irchel/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Didinium/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Euplotes/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Euplotes_second/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Paramecium_bursaria/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Paramecium_caudatum/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
# load("../data/25x/Rotifer/Morph_mvt.RData")
# dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Dexiostoma/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Loxocephallus/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd25 <- rbind(dd25, morph_mvt)
load("../data/25x/Cryptomonas/Morph_mvt.RData")
morph_mvt$species <- "Cryptomonas"
dd25 <- rbind(dd25, morph_mvt)
# load("../data/25x/Tetrahymena_mixed//Morph_mvt.RData")
# morph_mvt$species <- "Tetrahymena"
# morph_mvt <- morph_mvt %>% filter(mean_area<1000)
# dd25 <- full_join(dd25, morph_mvt)
load("../data/25x/Debris/Morph_mvt.RData")
morph_mvt$species <- "Debris_and_other"
morph_mvt <- morph_mvt %>% filter(mean_area<700)
dd25 <- full_join(dd25, morph_mvt)
## Joining, by = c("file", "mean_grey", "sd_grey", "mean_area", "sd_area", "mean_perimeter", "sd_perimeter", "mean_major", "sd_major", "mean_minor", "sd_minor", "mean_ar", "sd_ar", "mean_turning", "sd_turning", "duration", "N_frames", "max_net", "net_disp", "net_speed", "gross_disp", "gross_speed", "max_step", "min_step", "sd_step", "sd_gross_speed", "max_gross_speed", "min_gross_speed", "id", "date", "species", "sample", "video")
## filtering
dd25 <- dd25 %>%
filter(net_disp>50, net_speed>5)%>%
select(-edible_algae,-microcosm.nr)
dd25$magnification <- 2.5
species <- unique(dd25$species)
species <- species[!is.element(species,c("Debris_and_other","Cryptomonas"))]
compositions <- read_csv("../../compositions.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## composition = col_character()
## )
## See spec(...) for full column specifications.
compositions <- compositions[,species]
compositions_list <- apply(compositions, 1, function(x){
idx <- which(x==1)
names(idx)
})
names(compositions_list) <- paste0("c_",c(paste0(0,1:9),10:16))
classifier_plot_svm <- function(table, combination.nr){
# cm <- classifier$confusion
cm <- table
ncol <- ncol(cm)
cm <- apply(cm, 1, function(x) x/sum(x))
cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
Classification = as.vector(cm))
plot <- cm_long %>%
ggplot(aes(Predicted,Truth,fill=Classification))+
geom_tile() +
geom_text(aes(label = round(Classification, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
theme(legend.text = element_text(size=14),
axis.title = element_text(size=14),
title = element_text(size=16),
axis.text = element_text(size=13))+
labs(title=paste("PPV of",combination.nr),fill="PPV")
return(plot)
}
classifier_assessment_plot <- function(cf, combination.nr){
cf.df <- cf$byClass[,1:4] %>%
as.data.frame() %>%
mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
"Sens" = "Sensitivity", "Spec" = "Specificity") %>%
pivot_longer(cols = 1:4) %>%
mutate(col = ifelse(value>=0.9,"1",
ifelse(value<0.9 & value>=0.8,"2",
ifelse(value<0.8 & value>=0.7,"3","4"))))
plot <- cf.df %>%
ggplot(aes(name,Species,fill=col))+
geom_tile() +
geom_text(aes(label = round(value, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
theme(legend.text = element_text(size=14),
title = element_text(size = 16),
axis.title = element_blank(),
axis.text = element_text(size=13),
legend.position = "none")+
labs(title=combination.nr, fill="")
return(plot)
}
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
sd_major + mean_ar + sd_ar +
mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed
trainingdata <- dd25[complete.cases(dd25), ]
set.seed(624)
trainingdata$species <- factor(trainingdata$species)
split_up <- split(trainingdata, f = trainingdata$species)
subsamples <- lapply(split_up, function(x) {
x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)
# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
p = 0.7, # percentage of data as training
list = FALSE)
testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]
obj <- tune(svm, formula, data = trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=testdata %>% select(-species)),
testdata$species)
classifier_plot_svm(table = cf$table,
combination.nr = "all")
classifier_assessment_plot(cf = cf,
combination.nr = "all")
There are mainly 4 measures:
Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)
Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)
Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)
Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)
PPV is the most important for us!
formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
sd_major + mean_ar + sd_ar +
mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed
set.seed(62)
classifiers_18c_25x <- list()
classifiers_18c_25x_plots <- list()
classifiers_18c_25x_assess_plots <- list()
trainingdata <- dd25[complete.cases(dd25), ]
for(i in 1:length(compositions_list)){
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]],"Cryptomonas","Debris_and_other"))
n.var <- length(unique(sub_trainingdata$species))
sub_trainingdata$species <- factor(sub_trainingdata$species)
split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
subsamples <- lapply(split_up, function(x) {
x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
sub_trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)
# A stratified random split of the data
idx_train <- createDataPartition(sub_trainingdata$species,
p = 0.7, # percentage of data as training
list = FALSE)
sub_testdata <- sub_trainingdata[-idx_train,]
sub_trainingdata <- sub_trainingdata[idx_train,]
n.min <- min(table(sub_trainingdata$species))
obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
classifiers_18c_25x[[i]] <- obj$best.model
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species)
classifiers_18c_25x_plots[[i]] <- classifier_plot_svm(table = cf$table,
combination.nr = names(compositions_list)[[i]])
classifiers_18c_25x_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}
names(classifiers_18c_25x_plots) <- names(classifiers_18c_25x) <-
names(classifiers_18c_25x_assess_plots) <- paste0("c_",c(paste0(0,1:9),10:16))
library(patchwork)
for(i in 1:16){
print(classifiers_18c_25x_plots[[i]] + classifiers_18c_25x_assess_plots[[i]] +
plot_layout(widths = c(4,2)))
}
saveRDS(classifiers_18c_25x, "svm_video_classifiers_18c_25x.rds")
# cl <- readRDS("classifiers_18c_25x.rds")